This notebook explores the use of SingleR
to perform cell-type annotation on datasets from the ScPCA project
SCPCP000007 (Gawad lab data).
Note: The first time you run this code it may take a few more minutes due to reference downloads, but they will be cached for faster future execution.
# load the R project
project_root <- here::here()
renv::load(project_root)
suppressPackageStartupMessages({
library(SingleCellExperiment)
library(SingleR)
library(celldex)
library(ggplot2)
})
theme_set(theme_bw())
set.seed(params$seed) # unclear if this is doing anything? probably not. but maybe later!
utils_dir <- file.path(project_root, "scripts", "utils")
# source the helper functions to grab the integration method check
source(file.path(utils_dir, "integration-helpers.R"))
gawad_project_id <- "SCPCP000007"
sce_file_suffix <- "processed_citeseq.rds"
Read in the data:
library_metadata_df <- readr::read_tsv(params$library_file)
## Rows: 25 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (15): project_name, submitter, library_biomaterial_id, sample_biomateria...
## lgl (1): has_CITEseq
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Define the SCE filenames
sce_file_paths <- library_metadata_df %>%
dplyr::filter(project_name == gawad_project_id) %>%
dplyr::mutate(sce_file_path = file.path(
integration_input_dir, sample_biomaterial_id, glue::glue("{library_biomaterial_id}_{sce_file_suffix}")
)) %>%
dplyr::pull(sce_file_path)
Currently the code chunks in this section are not evaluated, but remain here for now for posterity.
To build an initial sense of what we can expect annotating with
SingleR, let’s just look at one SCE (arbitrarily chosen as
the first):
# For now let's just explore one!
sce_file <- sce_file_paths[[1]]
sce <- readr::read_rds(sce_file)
sce
The celldex package contains bulk RNA-Seq datasets for
use as reference. Since this is AML data, we’ll want a reference that
has a lot of blood information. According to the celldex,
the Human Primary Cell Atlas data will be our closest
match.
The HPCA reference consists of publicly available microarray datasets derived from human primary cells (Mabbott et al. 2013). Most of the labels refer to blood subpopulations but cell types from other tissues are also available.
# define reference with ensembl IDs to match our IDs
ref_se <- celldex::HumanPrimaryCellAtlasData(ensembl = TRUE)
# Predict cell types
# If we change to `labels = ref_se$label.fine`, we'll get more
# fine-grained annotations with subtypes etc.
preds_hpca <- SingleR::SingleR(
# dataset we want to annotate
test = sce,
# reference dataset
ref = ref_se,
# label.main is broad
labels = ref_se$label.main)
# Results into a tibble:
preds_df <- tibble::as_tibble(preds_hpca$labels) %>%
dplyr::rename(celltype = value) %>%
dplyr::mutate(cell_barcode = rownames(preds_hpca),
reference = "hpca")
# Save the celltypes:
hpca_celltypes <- unique(ref_se$label.main)
And a heatmap version, showing all celltypes. The bar the top shows the final assignment, which are the rows with highest scores.
SingleR::plotScoreHeatmap(preds_hpca)
But can’t hurt to see how this compares to the
Blueprint/ENCODE reference:
The Blueprint/ENCODE reference consists of bulk RNA-seq data for pure stroma and immune cells generated by Blueprint (Martens and Stunnenberg 2013) and ENCODE projects (The ENCODE Project Consortium 2012).
# define reference with ensembl IDs to match our IDs
ref_se <- celldex::BlueprintEncodeData(ensembl = TRUE)
# Predict cell types, broadly
preds_blue_enc <- SingleR::SingleR(
# dataset we want to annotate
test = sce,
# reference dataset
ref = ref_se,
# label.main is broad
labels = ref_se$label.main)
# Results into a tibble:
preds_df <- preds_df %>%
dplyr::bind_rows(
tibble::as_tibble(preds_blue_enc$labels) %>%
dplyr::rename(celltype = value) %>%
dplyr::mutate(cell_barcode = rownames(preds_blue_enc),
reference = "blueprint_encode")
)
# Save the celltypes:
blueprint_celltypes <- unique(ref_se$label.main)
# And the heatmap:
SingleR::plotScoreHeatmap(preds_blue_enc)
What did the references predict?
# How many of each celltype?
preds_df %>%
dplyr::filter(reference == "hpca") %>%
dplyr::count(celltype) %>%
dplyr::arrange(-n)
preds_df %>%
dplyr::filter(reference == "blueprint_encode") %>%
dplyr::count(celltype) %>%
dplyr::arrange(-n)
We’ll have to harmonize the celltype names between references to do a robust comparison, but from a very quick glance, overlap is thinner than one might like. That said, we don’t necessarily expect Blueprint/ENCODE to do particularly well anyways!
For a clearer comparison, we’ll harmonize predicted celltypes, but let’s just focus on exactly matching celltypes as a start -
# Let's see what we have here:
sort(hpca_celltypes)
sort(blueprint_celltypes)
# Manually compiled data rame of celltypes roughly present in BOTH references
# for now leaving the Pre/Pro B-cells out
shared_celltypes_df <- tibble::tribble(
~hpca_celltype, ~blueprint_celltype,
# Both references have it:
"Astrocyte", "Astrocytes",
"B_cell", "B-cells",
"Chondrocytes","Chondrocytes",
"DC", "DC",
"Endothelial_cells", "Endothelial cells",
"Epithelial_cells", "Epithelial cells",
"Fibroblasts", "Fibroblasts",
"Keratinocytes", "Keratinocytes",
"Macrophage", "Macrophages",
"Monocyte", "Monocytes",
"Neurons", "Neurons",
"Neutrophils", "Neutrophils",
"NK_cell", "NK cells",
"Smooth_muscle_cells", "Smooth muscle",
# two groups - collapse back to overall T cells
"T_cells", "CD8+ T-cells",
"T_cells", "CD4+ T-cells",
# two groups of HSCs, again, collapse back to overall
"HSC_-G-CSF", "HSC",
"HSC_CD34+", "HSC"
) %>%
dplyr::mutate(harmonized_celltype = ifelse(blueprint_celltype == "HSC",
blueprint_celltype,
hpca_celltype))
harmonize_celltypes <- function(preds_df, reference_name) {
if (reference_name == "hpca") {
shared_celltypes_colname <- rlang::sym("hpca_celltype")
} else {
shared_celltypes_colname <- rlang::sym("blueprint_celltype")
}
filtered_preds_df <- preds_df %>%
dplyr::filter(reference == reference_name)
sym_reference_name <- rlang::sym(reference_name)
filtered_preds_df %>%
dplyr::inner_join(
dplyr::select(
shared_celltypes_df,
celltype = {{shared_celltypes_colname}},
harmonized_celltype
)
) %>%
dplyr::select(cell_barcode, {{sym_reference_name}} := harmonized_celltype) %>%
dplyr::right_join(filtered_preds_df) %>%
dplyr::mutate({{reference_name}} := dplyr::if_else(
{{sym_reference_name}} %in% shared_celltypes_df$harmonized_celltype,
{{sym_reference_name}},
celltype
)) %>%
dplyr::select(-celltype, -reference)
}
preds_df_harmonized <- harmonize_celltypes(preds_df, "blueprint_encode") %>%
dplyr::left_join(
harmonize_celltypes(preds_df, "hpca")
)
# How often do they match?
preds_df_harmonized %>%
dplyr::summarize(percent_same = sum(blueprint_encode == hpca) / dplyr::n() )
# Where do they not match?
preds_df_harmonized %>%
dplyr::filter(hpca != blueprint_encode)
# Are there specific combinations that get differently annotated?
preds_df_harmonized %>%
dplyr::filter(hpca != blueprint_encode) %>%
dplyr::select(-cell_barcode) %>%
dplyr::count(blueprint_encode, hpca) %>%
dplyr::arrange(-n)
Now, let’s do cell type annotation with the given reference in
params$reference on each of the Gawad libraries and look at
their UMAPs colored by celltype.
if (params$reference == "hpca") {
ref_data <- celldex::HumanPrimaryCellAtlasData(ensembl = TRUE)
} else if (params$reference == "blueprint_encode") {
ref_data <- celldex::BlueprintEncodeData(ensembl = TRUE)
} else {
stop("Bad reference parameter; either 'hpca' or 'blueprint_encode'.")
}
## snapshotDate(): 2022-04-26
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## snapshotDate(): 2022-04-25
## loading from cache
## require("ensembldb")
## Warning: Unable to map 1470 of 19363 requested IDs.
annotate_SingleR <- function(sce, ref_data) {
# return updated sce and the predictions themselves
preds <- SingleR::SingleR(
test = sce,
ref = ref_data,
labels = ref_data$label.main
)
# Add `pruned.labels`, where low-confidence annotations are NA, to sce
sce$SingleR_annotations <- preds$pruned.labels
return(
list(
sce = sce,
preds = preds
)
)
}
plot_SingleR <- function(annotation_output) {
# annotation_output: list of sce and preds
# Make a heatmap and UMAP and print out
heatmap <- SingleR::plotScoreHeatmap(annotation_output[["preds"]],
# default but let's be explicit
show.pruned = FALSE)
umap_df <- tibble::as_tibble(reducedDim(annotation_output[["sce"]], "UMAP")) %>%
dplyr::select(UMAP1 = V1, UMAP2 = V2) %>%
dplyr::mutate(celltypes = annotation_output[["sce"]]$SingleR_annotations)
# We would probably like a more principled approach to colors that is
# actually based on biological information about celltypes
plot_colors <- rainbow( length(unique(umap_df$celltypes[!(is.na(umap_df$celltypes))])))
umap <- ggplot(umap_df) +
aes(x = UMAP1, y = UMAP2, color = celltypes) +
geom_point(size = 0.2, alpha = 0.5) +
scale_color_manual(values = plot_colors)
print(heatmap)
print(umap)
}
# Function to run and plot SingleR
# Return SCE with annotations `SingleR_annotations` column
run_SingleR <- function(sce, viz = TRUE) {
# Print out the library
print(unique( metadata(sce)$library ))
# Run and plot (if TRUE) annotations
anno <- annotate_SingleR(sce, ref_data)
if (viz) {
# there is no interesting color palette harmonization among library colors here!
plot_SingleR(anno)
}
return(anno[["sce"]])
}
Here we go!
# Read in all SCE files
sce_list <- purrr::map(
sce_file_paths,
readr::read_rds
)
# Annotate them all, popping out some viz along the way
singler_sce_list <- purrr::map(sce_list, run_SingleR, viz = TRUE)
## [1] "SCPCL000295"
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## [1] "SCPCL000296"
## [1] "SCPCL000297"
## [1] "SCPCL000298"
## [1] "SCPCL000299"
Now let’s plot, to start, the fastMNN UMAP but with celltype annotations from individual libraries. As is deeply commented all over the place, we definitely need to infuse palettes with biology!
# helper to count the number of celltypes across all libraries
count_celltypes <- function(sce) {
tibble::tibble(celltype = colData(sce)$SingleR_annotations ) %>%
dplyr::count(celltype) %>%
dplyr::arrange(-n) %>%
tidyr::drop_na()
}
# Celltype order based on **overall** counts for all pooled annotations
celltype_order <- purrr::map_df(singler_sce_list, count_celltypes) %>%
dplyr::group_by(celltype) %>%
dplyr::summarize(celltype_counts = sum(n)) %>%
dplyr::arrange(-celltype_counts) %>%
dplyr::pull(celltype)
# Again we'd eventually like some biology to go into the color choices!
cell_colors <- rainbow(length(celltype_order))
# Read in an integrated SCE just to start getting a sense
fastmnn <- readr::read_rds(file.path("results", "scpca", "integrated_sce",
paste0(gawad_project_id, "_integrated_fastmnn_sce.rds" ))
)
# Find all the celltype annotations for a combined color palette:
extract_celltypes <- function(sce) {
tibble::tibble(batch = unique(metadata(sce)$library),
celltype = colData(sce)$SingleR_annotations,
cell_barcode = rownames(colData(sce))) %>%
dplyr::mutate(cell_name = paste(cell_barcode, batch, sep = "-"))
}
# fastMNN UMAP with individual library cell annotations colored
purrr::map_df(singler_sce_list, extract_celltypes) %>%
dplyr::inner_join(
tibble::as_tibble(colData(fastmnn), rownames = "cell_name")
) %>%
dplyr::bind_cols(
as.data.frame(reducedDim(fastmnn, "fastmnn_UMAP"))
) %>%
dplyr::mutate(UMAP1 = V1, UMAP2 = V2) %>%
# And let's make a UMAP!
ggplot() +
aes(x = UMAP1, y = UMAP2, color = celltype) +
geom_point(size = 0.2, alpha = 0.3) +
# we definitely want some biology here!
scale_color_manual(values = cell_colors)
## Joining, by = c("batch", "cell_name")
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices datasets utils methods
## [8] base
##
## other attached packages:
## [1] ensembldb_2.20.2 AnnotationFilter_1.20.0
## [3] GenomicFeatures_1.48.3 AnnotationDbi_1.58.0
## [5] magrittr_2.0.3 ggplot2_3.3.6
## [7] celldex_1.6.0 SingleR_1.10.0
## [9] SingleCellExperiment_1.18.0 SummarizedExperiment_1.26.1
## [11] Biobase_2.56.0 GenomicRanges_1.48.0
## [13] GenomeInfoDb_1.32.3 IRanges_2.30.0
## [15] S4Vectors_0.34.0 BiocGenerics_0.42.0
## [17] MatrixGenerics_1.8.1 matrixStats_0.62.0
##
## loaded via a namespace (and not attached):
## [1] AnnotationHub_3.4.0 BiocFileCache_2.4.0
## [3] lazyeval_0.2.2 splines_4.2.1
## [5] BiocParallel_1.30.3 digest_0.6.29
## [7] htmltools_0.5.3 viridis_0.6.2
## [9] fansi_1.0.3 memoise_2.0.1
## [11] ScaledMatrix_1.4.0 tzdb_0.3.0
## [13] Biostrings_2.64.0 miQC_1.4.0
## [15] readr_2.1.2 vroom_1.5.7
## [17] prettyunits_1.1.1 colorspace_2.0-3
## [19] blob_1.2.3 rappdirs_0.3.3
## [21] xfun_0.32 dplyr_1.0.9
## [23] crayon_1.5.1 RCurl_1.98-1.8
## [25] jsonlite_1.8.0 glue_1.6.2
## [27] gtable_0.3.0 zlibbioc_1.42.0
## [29] XVector_0.36.0 DelayedArray_0.22.0
## [31] BiocSingular_1.12.0 scales_1.2.0
## [33] pheatmap_1.0.12 DBI_1.1.3
## [35] Rcpp_1.0.9 viridisLite_0.4.0
## [37] xtable_1.8-4 progress_1.2.2
## [39] bit_4.0.4 rsvd_1.0.5
## [41] httr_1.4.3 RColorBrewer_1.1-3
## [43] ellipsis_0.3.2 modeltools_0.2-23
## [45] farver_2.1.1 pkgconfig_2.0.3
## [47] XML_3.99-0.10 flexmix_2.3-18
## [49] nnet_7.3-17 sass_0.4.2
## [51] dbplyr_2.2.1 utf8_1.2.2
## [53] here_1.0.1 labeling_0.4.2
## [55] tidyselect_1.1.2 rlang_1.0.4
## [57] later_1.3.0 munsell_0.5.0
## [59] BiocVersion_3.15.2 tools_4.2.1
## [61] cachem_1.0.6 cli_3.3.0
## [63] generics_0.1.3 RSQLite_2.2.15
## [65] ExperimentHub_2.4.0 evaluate_0.16
## [67] stringr_1.4.0 fastmap_1.1.0
## [69] yaml_2.3.5 knitr_1.39
## [71] bit64_4.0.5 purrr_0.3.4
## [73] KEGGREST_1.36.3 sparseMatrixStats_1.8.0
## [75] mime_0.12 xml2_1.3.3
## [77] biomaRt_2.52.0 compiler_4.2.1
## [79] rstudioapi_0.13 filelock_1.0.2
## [81] curl_4.3.2 png_0.1-7
## [83] interactiveDisplayBase_1.34.0 tibble_3.1.8
## [85] bslib_0.4.0 stringi_1.7.8
## [87] highr_0.9 lattice_0.20-45
## [89] ProtGenerics_1.28.0 Matrix_1.4-1
## [91] vctrs_0.4.1 pillar_1.8.0
## [93] lifecycle_1.0.1 BiocManager_1.30.18
## [95] jquerylib_0.1.4 BiocNeighbors_1.14.0
## [97] bitops_1.0-7 irlba_2.3.5
## [99] httpuv_1.6.5 rtracklayer_1.56.1
## [101] R6_2.5.1 BiocIO_1.6.0
## [103] promises_1.2.0.1 renv_0.15.5
## [105] gridExtra_2.3 codetools_0.2-18
## [107] assertthat_0.2.1 rprojroot_2.0.3
## [109] rjson_0.2.21 withr_2.5.0
## [111] GenomicAlignments_1.32.1 Rsamtools_2.12.0
## [113] GenomeInfoDbData_1.2.8 parallel_4.2.1
## [115] hms_1.1.1 grid_4.2.1
## [117] beachmat_2.12.0 tidyr_1.2.0
## [119] rmarkdown_2.14 DelayedMatrixStats_1.18.0
## [121] shiny_1.7.2 restfulr_0.0.15